import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import re
/Users/lettyuy/opt/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.0
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
# Load the Hot 100 chart history and normalise its date columns.
df = pd.read_csv("Hot 100.csv")
for date_column in ('chart_date', 'chart_debut'):
    # datetime64 values are needed for the .dt accessor used just below.
    df[date_column] = pd.to_datetime(df[date_column])
# Calendar year of each chart entry; later cells group and animate by it.
df = df.assign(chart_year=df['chart_date'].dt.year)
df.head()
| | chart_position | chart_date | song | performer | song_id | instance | time_on_chart | consecutive_weeks | previous_week | peak_position | worst_position | chart_debut | chart_url | chart_year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 84 | 1990-05-05 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 1 | NaN | NaN | 84 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 1 | 78 | 1990-05-12 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 2 | 1.0 | 84.0 | 78 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 2 | 68 | 1990-05-19 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 3 | 2.0 | 78.0 | 68 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 3 | 60 | 1990-05-26 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 4 | 3.0 | 68.0 | 60 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 4 | 58 | 1990-06-02 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 5 | 4.0 | 60.0 | 58 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
# Split multi-artist performer credits into one row per individual artist.
# The `re` module is used (rather than Series.str.split) so the delimiters
# are matched case-insensitively, e.g. "Featuring" as well as "featuring".
# NOTE: act names that themselves contain a delimiter (a comma, "&" or
# " and ") are split as well; this is a known limitation of the pattern.
_ARTIST_DELIMITERS = re.compile(
    r',|&| and | featuring | feat\. | ft\. ', flags=re.IGNORECASE
)


def _split_performers(performer):
    """Return the stripped, non-empty artist names found in *performer*.

    Args:
        performer: The raw performer credit for one chart row.

    Returns:
        A list of artist names. Fragments that are empty after stripping
        (produced by adjacent delimiters such as ", and ") are dropped so
        they cannot be grouped together as a nameless artist. A non-string
        value (e.g. a missing performer read as NaN) yields an empty list
        instead of raising TypeError.
    """
    if not isinstance(performer, str):
        return []
    fragments = (part.strip() for part in _ARTIST_DELIMITERS.split(performer))
    return [name for name in fragments if name]


df['individual_artist'] = df['performer'].apply(_split_performers)
# explode() turns each list element into its own row; an empty list becomes
# a single row whose individual_artist is NaN.
df = df.explode('individual_artist')
# Mean chart position of every (song, artist) pair, rounded to a whole
# number and attached to each chart row as 'avg_chart_position'.
artist_song_keys = ['song', 'individual_artist']
avg_chart_positions = df.groupby(artist_song_keys, as_index=False).agg(
    avg_chart_position=('chart_position', 'mean')
)
avg_chart_positions['avg_chart_position'] = (
    avg_chart_positions['avg_chart_position'].round().astype(int)
)
# Left join so every row of df is kept even if its key has no average.
df = df.merge(avg_chart_positions, how='left', on=artist_song_keys)
# Chart rows in which the entry is at position 1.
number_one_mask = df['chart_position'] == 1
df_at_1 = df[number_one_mask]

# One row per (artist, song) pair that reached #1; 'count' holds how many
# #1 rows that pair has.
unique_songs_at_1 = (
    df_at_1.groupby(['individual_artist', 'song'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)
# Number of distinct #1 songs per artist.
individual_artist_hits = unique_songs_at_1.groupby('individual_artist').size()

# Exactly one #1 song -> one-hit wonder; three or more -> staying power.
# Artists with exactly two #1 songs fall into neither list.
single_hit_mask = individual_artist_hits == 1
staying_power_mask = individual_artist_hits >= 3
one_hit_artists_list = individual_artist_hits[single_hit_mask].index.tolist()
artists_with_staying_power_list = individual_artist_hits[staying_power_mask].index.tolist()

# For each group keep a single #1 row per (song, artist): the first one
# encountered in df.
dedupe_keys = ['song', 'individual_artist']
df_one_hit_wonders = df[
    number_one_mask & df['individual_artist'].isin(one_hit_artists_list)
].drop_duplicates(subset=dedupe_keys)
df_artists_with_staying_power = df[
    number_one_mask & df['individual_artist'].isin(artists_with_staying_power_list)
].drop_duplicates(subset=dedupe_keys)
def get_top_10_per_year(
    group: pd.DataFrame,
    n: int = 10,
    column: str = 'consecutive_weeks',
) -> pd.DataFrame:
    """Return the rows of *group* with the largest values in *column*.

    Intended for use with ``DataFrame.groupby(...).apply``, which passes one
    sub-frame per group as the first argument.

    Args:
        group: The rows belonging to a single group.
        n: Maximum number of rows to keep. Defaults to 10.
        column: Column to rank by. Defaults to ``'consecutive_weeks'``.

    Returns:
        At most *n* rows of *group*, ordered from the largest value of
        *column* downwards.
    """
    return group.nlargest(n, column)
# Tag each group with its label. assign() returns a new frame, so nothing is
# written into the filtered slices of df (which would raise pandas'
# SettingWithCopyWarning).
df_one_hit_wonders = df_one_hit_wonders.assign(Source='One Hit Wonders')
df_artists_with_staying_power = df_artists_with_staying_power.assign(
    Source='Artists with Staying Power'
)

# Per chart year, keep the rows selected by get_top_10_per_year.
top_10_one_hit_wonders_yearly = (
    df_one_hit_wonders.groupby('chart_year')
    .apply(get_top_10_per_year)
    .reset_index(drop=True)
)
top_10_staying_power_yearly = (
    df_artists_with_staying_power.groupby('chart_year')
    .apply(get_top_10_per_year)
    .reset_index(drop=True)
)

# Stack both groups and give the plotted columns display-friendly names.
top_10_combined_yearly = pd.concat(
    [top_10_one_hit_wonders_yearly, top_10_staying_power_yearly]
)
top_10_combined_yearly = top_10_combined_yearly.rename(
    columns={
        'time_on_chart': 'Time on Chart',
        'avg_chart_position': 'Average Chart Position',
        'song': 'Song',
    }
)

# Every year and every source label that should appear in the animation.
all_years = top_10_combined_yearly['chart_year'].unique()
all_sources = ["One Hit Wonders", "Artists with Staying Power"]
# Build the plotting table so that every (year, source) pair has at least one
# row. Pairs with no real data get a NaN placeholder row, which keeps both
# sources present in every animation frame.
expanded_data = []
# Iterate the years in ascending order: all_years comes from unique() on a
# concatenated frame, so years that only occur in the second source would
# otherwise be appended last and the animation frames (which follow the order
# in which values first appear in the data) would not be chronological.
for year in sorted(all_years):
    # Filter by year once per year rather than once per (year, source).
    year_rows = top_10_combined_yearly[top_10_combined_yearly['chart_year'] == year]
    for Source in all_sources:
        subset = year_rows[year_rows['Source'] == Source]
        if subset.empty:
            expanded_data.append({
                'chart_year': year,
                'Source': Source,
                'Average Chart Position': np.nan,
                'Time on Chart': np.nan,
                'individual_artist': f'Placeholder {Source} {year}'
            })
        else:
            expanded_data.extend(subset.to_dict('records'))
expanded_df = pd.DataFrame(expanded_data)
# Animated scatter: one frame per chart year, one point per artist row,
# coloured by source group.
avg_position_values = top_10_combined_yearly['Average Chart Position']
time_on_chart_values = top_10_combined_yearly['Time on Chart']
fig = px.scatter(
    data_frame=expanded_df,
    x="Average Chart Position",
    y="Time on Chart",
    color="Source",
    animation_frame="chart_year",
    animation_group="individual_artist",
    hover_name="individual_artist",
    hover_data={"Song": True, "Source": False, "chart_year": False},
    size_max=55,
    # The x range runs from the largest value to the smallest, i.e. the
    # axis is reversed.
    range_x=[avg_position_values.max(), avg_position_values.min()],
    range_y=[0, time_on_chart_values.max()],
)
# Coordinates at which the plot is split into four quadrants.
x_mid = 30
y_mid = 30

# Both divider lines share the same thin, dashed, black style.
divider_line_style = dict(color="Black", dash="dash", width=0.5)

# Vertical divider at x_mid, from y=0 up to the largest 'Time on Chart'.
fig.add_shape(
    type="line",
    x0=x_mid,
    x1=x_mid,
    y0=0,
    y1=top_10_combined_yearly['Time on Chart'].max(),
    line=divider_line_style,
)
# Horizontal divider at y_mid, spanning the 'Average Chart Position' values.
fig.add_shape(
    type="line",
    x0=top_10_combined_yearly['Average Chart Position'].max(),
    x1=top_10_combined_yearly['Average Chart Position'].min(),
    y0=y_mid,
    y1=y_mid,
    line=divider_line_style,
)
# Label each quadrant at a point offset by half of x_mid / y_mid from the
# quadrant split point. Entries are (text, x, y).
quadrant_labels = [
    ("Lower Rank, long duration", x_mid + (x_mid / 2), y_mid + (y_mid / 2)),
    ("High rank, long duration", x_mid - (x_mid / 2), y_mid + (y_mid / 2)),
    ("High rank, short duration", x_mid - (x_mid / 2), y_mid - (y_mid / 2)),
    ("Lower rank, short duration", x_mid + (x_mid / 2), y_mid - (y_mid / 2)),
]
for label_text, label_x, label_y in quadrant_labels:
    fig.add_annotation(text=label_text, x=label_x, y=label_y, showarrow=False)
# Axis titles. The y axis plots the 'Time on Chart' column (the dataset's
# time_on_chart value), not a count of weeks spent at #1, so the title names
# the quantity that is actually drawn.
fig.update_layout(
    xaxis_title="Average Chart Position",
    yaxis_title="Time on Chart (weeks)"
)
fig.show()